// Copyright 2020 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef COMPONENTS_URL_FORMATTER_SPOOF_CHECKS_SKELETON_GENERATOR_H_
#define COMPONENTS_URL_FORMATTER_SPOOF_CHECKS_SKELETON_GENERATOR_H_

#include <map>
#include <memory>
#include <string>

#include "base/containers/flat_set.h"
#include "base/memory/raw_ptr.h"
#include "base/strings/string_piece_forward.h"

#include "third_party/icu/source/common/unicode/uniset.h"

// Forward declaration of the Transliterator class used by the
// std::unique_ptr<icu::Transliterator> members of SkeletonGenerator below.
// The namespace must be reopened as U_ICU_NAMESPACE; 'icu' does not work here.
namespace U_ICU_NAMESPACE {

class Transliterator;

}  // namespace U_ICU_NAMESPACE

// Forward declaration. This header only refers to USpoofChecker through
// pointers (the SkeletonGenerator constructor argument and a raw_ptr member).
struct USpoofChecker;

// A set of skeleton strings, e.g. all skeletons produced for one hostname or
// all skeletons a single character can map to.
using Skeletons = base::flat_set<std::string>;
// One-to-many mapping from a UTF-16 code unit to the skeletons it may be
// replaced with.
using SkeletonMap = std::map<char16_t, Skeletons>;

// This class generates skeleton strings from hostnames. Skeletons are a
// transformation of the input hostname. Two hostnames are confusable if their
// skeletons are identical. See http://unicode.org/reports/tr39/ for more
// information.
//
// Transformation of a hostname to its skeleton strings happens in multiple
// steps:
// 1. The hostname is "normalized" by removing its diacritics. This is done so
//    that more confusable hostnames can be detected than would be using the
//    plain ICU API.
// 2. Supplemental hostname strings are generated from the normalized hostname
//    using a manually curated "multiple skeleton" table. This table has a
//    one-to-many relationship between characters and their skeletons. The
//    number of skeletons generated by this step is capped to a maximum number.
//    This step is done before ICU's skeleton generation (which is many-to-one)
//    so that we can generate more supplemental hostnames. For example, ICU
//    maps "œ" to "oe". Since the character "œ" won't appear in the ICU
//    skeleton, we can't produce supplemental skeletons for it. Therefore, we
//    must map it to "oe" and "ce" before skeleton generation.
// 3. For each supplemental hostname, the following steps are performed:
// 4. Certain characters in the hostname are mapped to their confusable
//    equivalents using a manually curated table (extra confusable mapper). This
//    table has a many-to-one relationship between characters and their
//    skeletons. For example, the characters є, ҽ, ҿ, and ၔ are all
//    mapped to Latin lowercase e.
// 5. The hostname is passed to ICU to generate actual skeleton strings.
// 6. If the character U+04CF (ӏ) is present in the skeleton, another skeleton
//    is generated by mapping it to lowercase L (U+6C).
// 7. The final output is a Skeletons instance which contains one or more
//    skeleton strings that represent the input hostname.
class SkeletonGenerator {
 public:
  explicit SkeletonGenerator(const USpoofChecker* checker);
  ~SkeletonGenerator();

  // Returns the set of skeletons for the |hostname|. For IDN, |hostname| must
  // already be decoded to unicode.
  Skeletons GetSkeletons(base::StringPiece16 hostname);

  // Returns true if the diacritics should be removed from |label|. Diacritic
  // removal is a slow operation and should be avoided when possible.
  bool ShouldRemoveDiacriticsFromLabel(const icu::UnicodeString& label) const;

  // Removes diacritics from |hostname| and returns the new string if the input
  // only contains Latin-Greek-Cyrillic characters. Otherwise, returns the
  // input string.
  std::u16string MaybeRemoveDiacritics(base::StringPiece16 hostname);

  // Returns the set of alternative strings using the one-to-many string
  // mapping provided in `mapping`. Generates at most `max_alternatives` strings
  // from the input string.
  static base::flat_set<std::u16string> GenerateSupplementalHostnames(
      base::StringPiece16 input,
      size_t max_alternatives,
      const SkeletonMap& mapping);

 private:
  // Adds an additional mapping from |src_char| to |mapped_char| when generating
  // skeletons: If |host| contains |src_char|, |skeletons| will contain a new
  // skeleton where all occurances of |src_char| are replaced with
  // |mapped_char|.
  void AddSkeletonMapping(const icu::UnicodeString& host,
                          int32_t src_char,
                          int32_t mapped_char,
                          Skeletons* skeletons);
  void MaybeRemoveDiacritics(icu::UnicodeString& hostname);
  // Returns true if supplemental hostnames of `input_hostname` should be
  // generated without removing its diacritics.
  bool ShouldComputeSupplementalHostnamesWithDiacritics(
      base::StringPiece16 input_hostname) const;

  icu::UnicodeSet lgc_letters_n_ascii_;

  std::unique_ptr<icu::Transliterator> diacritic_remover_;
  std::unique_ptr<icu::Transliterator> extra_confusable_mapper_;

  // Map of characters to their skeletons. This map is manually curated.
  std::map<char16_t, Skeletons> character_map_;
  // Contains the characters from character_map_ that have diacritics. This is
  // used to determine if we should compute supplemental hostnames for a
  // hostname without removing its diacritics.
  base::flat_set<char16_t> characters_with_multiple_skeletons_with_diacritics_;

  raw_ptr<const USpoofChecker> checker_;
};

#endif  // COMPONENTS_URL_FORMATTER_SPOOF_CHECKS_SKELETON_GENERATOR_H_
